% This m-script reads in the raw STAR data and define the class/groups
% Date: 2019-04-20

clear; clc; close all;
%% ----------- read data and field names ---------------
% filepath = 'C:\Users\xt9\Box Sync\misclassifiedNetwork\data\';
filepath = 'C:\Users\xt9\Dropbox\Arthur_Xi_Xun\unobserved_links\final submission files\Data\';
filename = [filepath 'STARdata2.csv'];
headSpec = []; dataSpec = [];
for i = 1:65;
    headSpec = [headSpec, '%s '];  dataSpec = [dataSpec, '%f '];
end;
fileID = fopen(filename,'r');
fieldNames = textscan(fileID, headSpec, 1, 'Delimiter', ',');
dataMat = textscan(fileID, dataSpec, 'Delimiter', ',', ...
          'EmptyValue' ,NaN, 'HeaderLines' ,1, 'ReturnOnError', false);
fclose(fileID);  
% [dataMat, fieldNames] = xlsread(filename); %% a much simpler alternative 
for i = 1:size(dataMat,2),
    varname = genvarname([fieldNames{1,i}]);
    eval([varname{:} ' = dataMat{i}(:);']);
end;
clear dataMat fieldNames fileID filename varname headSpec dataSpec i ans;
save([filepath 'STARdata.mat']);  % clear filepath;
%% ---------- construct classes and label students --------------
% partition students in the sample into class
teacherID = unique(g3tchid); % collect unique teacher ID
classSize = zeros(length(teacherID),1); % record class size
majorProp = zeros(length(teacherID),1); % record proportion of white students
dobDistrn = zeros(length(teacherID),1); % record summary statistics in distr'n of student D.O.B.

scoreVars = [];  sizeVar = [];

for i = 1:length(teacherID)  % loop in teacherID (class)  
    stdntIndex{i} = find(g3tchid == teacherID(i)); % extract class members from full data
    classSize(i)  = length(stdntIndex{i}); % record class size
    % if classSize(i) < 13 || classSize(i) > 28
    %    continue; % skip classes that are too small or large
    % end;
    sortStdntDOB = sortrows([stdntIndex{i} birthyear(stdntIndex{i}) birthmonth(stdntIndex{i}) ...
                  birthday(stdntIndex{i})], [2 3 4]); % sort class members by birthday
    sortStdntIndex = sortStdntDOB(:,1);
    classData{i}(:,1) = stdntid(sortStdntIndex); % student ID 
    classData{i}(:,2) = g3tchid(sortStdntIndex); % teacher ID
    classData{i}(:,3) = sortStdntIndex; % student index in full sample 
    classData{i}(:,4) = race(sortStdntIndex); % student race
    majorProp(i) = mean(classData{i}(:,4)== mode(classData{i}(:,4))); % proportion of majority
    for j = 1:classSize(i), % loop in class members (student)
        % construct explantory variable
        % tempInd = sortStdntIndex(j);  % index of this class member (student) in the full sample
        refYear = birthyear(sortStdntIndex(1)); refMonth = birthmonth(sortStdntIndex(1));  % set reference D.O.B. in class
        yearDiff = birthyear(sortStdntIndex(j))-refYear; monthDiff = birthmonth(sortStdntIndex(j))-refMonth;
        switch yearDiff,
            case 0, classData{i}(j,5) = monthDiff;
            otherwise classData{i}(j,5) = 12 - refMonth + birthmonth(sortStdntIndex(j)) + 12*(yearDiff-1);
        end; % 5th column: # of months btw the student's birthday and that of the oldest in the class
    end;
    % dobMat = repmat(classData{i}(:,5),1,classSize(i)); 
    % % dobDistrn: avg # of classmates with similar d.o.b. (within 3 months) 
    % dobDistrn(i) = mean( sum( (abs(dobMat - dobMat') <= 3) , 2 ) > size(dobMat,1)/3 );  
    dobDistrn(i) = std(classData{i}(:,5),'omitnan');  % some classes has NaN values because of missing values in birthday   
    
    % class-level variables 
    classData{i}(:,6) = g3thighdegree(sortStdntIndex);  % teacher's highest degree 
    classData{i}(:,7) = g3tcareer(sortStdntIndex);  % teacher's career level 
    classData{i}(:,8) = g3tyears(sortStdntIndex);  % years of experience of teachers
    classData{i}(:,9) = g3classtype(sortStdntIndex); % class type
    % student demographics
    classData{i}(:,10)= g3absent(sortStdntIndex);  % # of days absent
    classData{i}(:,11)= g3motivraw(sortStdntIndex);  % motivation score
    classData{i}(:,12)= g2mathbsraw(sortStdntIndex);  % grade 2 raw score of math bs
    % dependent variable    
    classData{i}(:,13)= g3mathcomputss(sortStdntIndex);  % math computation score
    % ttemp = g3mathcomputss(sortStdntIndex);  % math computation score
    % classData{i}(:,13) = (ttemp - mean(ttemp))/std(ttemp);        
    classData{i}(:,14)= g3tmathss(sortStdntIndex);  % math scale total score    
    scoreVars = [scoreVars; classData{i}(:,12:14)];  
    sizeVar = [sizeVar; length(sortStdntIndex)*ones(length(sortStdntIndex),1)];
end;

% standardize scores in columns#12
vv = scoreVars(:,1); scoreMean = mean(vv(~isnan(vv))); scoreStd  = std(vv(~isnan(vv)));
sumMat = []; tecMat = []; 
for i = 1:length(teacherID), 
        classData{i}(:,12) = (classData{i}(:,12)-scoreMean)/scoreStd;   
        % stack relevant variables for summary statistics below
        clsz = size(classData{i},1);
        sumMat = [sumMat; clsz*ones(clsz,1), classData{i}(:,[7 8 10 11 12 14])];
        tecMat = [tecMat; clsz, classData{i}(1,[7 8])];
end;

%% trim small and big classes with low frequency in data
lb = 15; ub = 25; classInd  = find( classSize>=lb & classSize<=ub );
classData = classData(classInd);  
classSize = classSize(classInd);
dobDistrn = dobDistrn(classInd);
majorProp = majorProp(classInd); 
save([filepath 'classDataV2.mat'], 'classData', 'classSize', 'dobDistrn', 'majorProp');

%% report summary statistics
cut = 20; 
% summary of 3rd grad math scores
figure(1);
smInd = find((lb<=sumMat(:,1) & sumMat(:,1) <=cut & ~isnan(sumMat(:,7)))); 
lgInd = find((cut<sumMat(:,1) & sumMat(:,1) <=ub & ~isnan(sumMat(:,7)))); 
disp('Summary statistics: g3maths, small');
[mean(sumMat(smInd,7)), median(sumMat(smInd,7)), std(sumMat(smInd,7)), ...
    min(sumMat(smInd,7)), max(sumMat(smInd,7))],
subplot(2,1,1); hist(sumMat(smInd,7),50); axis([500 800 0 220]); 
title(sprintf('histogram in %g small classes',sum(classSize<=cut)));
disp('Summary statistics: g3maths, large');
[mean(sumMat(lgInd,7)), median(sumMat(lgInd,7)), std(sumMat(lgInd,7)), ...
    min(sumMat(lgInd,7)), max(sumMat(lgInd,7))],
subplot(2,1,2); hist(sumMat(lgInd,7),50); axis([500 800 0 220]); 
title(sprintf('histogram in %g large classes',sum(classSize>cut)));
suplabel('G3 math scores (raw)','t');

% summary of 2nd grad math scores
figure(2);
smInd2 = find((lb<=sumMat(:,1) & sumMat(:,1) <=cut & ~isnan(sumMat(:,6)))); 
lgInd2 = find((cut<=sumMat(:,1) & sumMat(:,1) <=ub & ~isnan(sumMat(:,6)))); 
disp('Summary statistics: g2maths, small');
[mean(sumMat(smInd2,6)), median(sumMat(smInd2,6)), std(sumMat(smInd2,6)), ...
    min(sumMat(smInd2,6)), max(sumMat(smInd2,6))],
subplot(2,1,1); hist(sumMat(smInd2,6),50); % axis([500 800 0 220]); 
title(sprintf('histogram in %g small classes',sum(classSize<=cut)));
disp('Summary statistics: g2maths, large');
[mean(sumMat(lgInd2,6)), median(sumMat(lgInd2,6)), std(sumMat(lgInd2,6)), ...
    min(sumMat(lgInd2,6)), max(sumMat(lgInd2,6))],
subplot(2,1,2); hist(sumMat(lgInd2,6),50); % axis([500 800 0 220]); 
title(sprintf('histogram in %g large classes',sum(classSize>cut)));
suplabel('G2 math scores (standardized)','t');

% summary of teacher experience, days of absence, motivation score
sI = find(lb<=sumMat(:,1) & sumMat(:,1) <=cut); 
lI = find(cut<sumMat(:,1) & sumMat(:,1) <=ub); 
SI = find(lb<=tecMat(:,1) & tecMat(:,1) <=cut); 
LI = find(cut<tecMat(:,1) & tecMat(:,1) <=ub); 

disp('Summary statistics: small: absDays, motScore, teacherExp');
[mean(sumMat(sI,4),'omitnan'), median(sumMat(sI,4),'omitnan'), std(sumMat(sI,4),'omitnan'), ...
    min(sumMat(sI,4)), max(sumMat(sI,4))],
[mean(sumMat(sI,5),'omitnan'), median(sumMat(sI,5),'omitnan'), std(sumMat(sI,5),'omitnan'), ...
    min(sumMat(sI,5)), max(sumMat(sI,5))],
[mean(tecMat(SI,3),'omitnan'), median(tecMat(SI,3),'omitnan'), std(tecMat(SI,3),'omitnan'), ...
    min(tecMat(SI,3)), max(tecMat(SI,3))],

disp('Summary statistics: large: absDays, motScore, teacherExp');
[mean(sumMat(lI,4),'omitnan'), median(sumMat(lI,4),'omitnan'), std(sumMat(lI,4),'omitnan'), ...
    min(sumMat(lI,4)), max(sumMat(lI,4))],
[mean(sumMat(lI,5),'omitnan'), median(sumMat(lI,5),'omitnan'), std(sumMat(lI,5),'omitnan'), ...
    min(sumMat(lI,5)), max(sumMat(lI,5))],
[mean(tecMat(LI,3),'omitnan'), median(tecMat(LI,3),'omitnan'), std(tecMat(LI,3),'omitnan'), ...
    min(tecMat(LI,3)), max(tecMat(LI,3))],

disp('t-test for days of absence');
[H4,P4] = ttest2(sumMat(sI,4),sumMat(lI,4),'alpha',.05,'vartype','unequal'), 
disp('t-test for teacher exp');
[H5,P5] = ttest2(tecMat(SI,3),tecMat(LI,3),'alpha',.05,'vartype','unequal'), 

%% test equality of sample means in dep and indep variables
% g3 math grades
[H1,P1] = ttest2(sumMat(smInd,7),sumMat(lgInd,7),'alpha',.05,'vartype','unequal'), 
[mean(sumMat(smInd,7)), mean(sumMat(lgInd,7))],
% g2 math grades
[H2,P2] = ttest2(sumMat(smInd2,6),sumMat(lgInd2,6),'alpha',.05,'vartype','unequal'), 
[mean(sumMat(smInd2,6)), mean(sumMat(lgInd2,6))],
% motivation scores
smInd3 = find(( lb<=sumMat(:,1)  & sumMat(:,1)  <=cut & ~isnan(sumMat(:,5)))); 
lgInd3 = find((cut<=sumMat(:,1)  & sumMat(:,1)  <=ub  & ~isnan(sumMat(:,5)))); 
disp('t-test for motivation score');
[H3,P3] = ttest2(sumMat(smInd3,5),sumMat(lgInd3,5),'alpha',.05,'vartype','unequal'), 
[mean(sumMat(smInd3,5)), mean(sumMat(lgInd3,5))],

% clear sumMat;

% gd3maths = scoreVars(:,3); 
% % histogram of 3rd grad math scores
% figure(1);
% smInd = find((lb<=sizeVar & sizeVar <=cut & ~isnan(gd3maths))); 
% lgInd = find((cut<=sizeVar & sizeVar <=ub & ~isnan(gd3maths))); 
% subplot(2,1,1); hist(gd3maths(smInd),50); axis([500 800 0 220]); 
% title(sprintf('histogram in %g small classes',sum(classSize<=cut)));
% subplot(2,1,2); hist(gd3maths(lgInd),50); axis([500 800 0 220]); 
% title(sprintf('histogram in %g large classes',sum(classSize>cut)));
% suplabel('G3 math scores (raw)','t');
% 
% % histogram of 2nd grad math scores
% figure(2);
% gd2maths = (scoreVars(:,1)-scoreMean)/scoreStd;  
% smInd2 = find((lb<=sizeVar & sizeVar <=cut & ~isnan(gd2maths))); 
% lgInd2 = find((cut<=sizeVar & sizeVar <=ub & ~isnan(gd2maths))); 
% subplot(2,1,1); hist(gd2maths(smInd2),50); % axis([500 800 0 220]); 
% title(sprintf('histogram in %g small classes',sum(classSize<=cut)));
% subplot(2,1,2); hist(gd2maths(lgInd2),50); % axis([500 800 0 220]); 
% title(sprintf('histogram in %g large classes',sum(classSize>cut)));
% suplabel('G2 math scores (standardized)','t');
% 
% %% test equality of sample means in dep and indep variables
% % g3 math grades
% [H1,P1] = ttest2(gd3maths(smInd),gd3maths(lgInd),'alpha',.05,'vartype','unequal'), 
% [mean(gd3maths(smInd)), mean(gd3maths(lgInd))],
% % g2 math grades
% [H2,P2] = ttest2(gd2maths(smInd2),gd2maths(lgInd2),'alpha',.05,'vartype','unequal'), 
% [mean(gd2maths(smInd2)), mean(gd2maths(lgInd2))],
% % motivation scores
% smInd3 = find((lb<=sizeVar & sizeVar <=cut & ~isnan(g3motivraw))); 
% lgInd3 = find((cut<=sizeVar & sizeVar <=ub & ~isnan(g3motivraw))); 
% [H3,P3] = ttest2(g3motivraw(smInd3),g3motivraw(lgInd3),'alpha',.05,'vartype','unequal'), 
% [mean(g3motivraw(smInd3)), mean(g3motivraw(lgInd3))],
 
% figure(2);
% [fs,xs] = ksdensity(gd3maths(smInd),'support', [450 750], 'function', 'pdf', 'kernel', 'epanechnikov'); 
% [fl,xl] = ksdensity(gd3maths(lgInd),'support', [450 750], 'function', 'pdf', 'kernel', 'epanechnikov'); 
% hold on; plot(xs,fs,'k--');  plot(xl,fl,'k-');  hold off; 
% clear sizeVar scoreVars;